Load Libraries

library(SummarizedExperiment)
library(dplyr)
library(ggplot2)
library(knitr)
library(DT)
library(limma)
library(EnhancedVolcano)
library(fgsea)
library(msigdbr)
library(gage)
library(pathview)

Data Loading and Preparation

Data Overview

  • Paper name : Genome-wide DNA methylation patterns in pancreatic ductal adenocarcinoma reveal epigenetic deregulation of SLIT-ROBO, ITGA2 and MET signaling [https://pubmed.ncbi.nlm.nih.gov/24500968/]
  • Patient samples : 39
  • gene_expression(Microarray ): 39 patients, 20215 genes
  • LOW : 31 patients
  • HIGH : 8 patients

Load the SummarizedExperiment data object, extract clinical and expression data (log2-transformed and quantile normalized), along with annotations. Prepare the gene expression data in the Lumi package for analysis.

# Load your SE result and extract clinical data , expression data and annotation

# Load SE obj
se <- readRDS("~/BHK lab/Pancreas-cancer/output data/SE_ICGC.rds")

# Extract Clinical data 
clin <- data.frame(colData(se)) #dim is 39 x 36

# Extract the expression data
expr_quantile <- assays(se)[["gene_expression"]] #dim 20215 x 39

# Extract the annotation
annot <- data.frame(rowData(se))  #dim is 20215 x 15

# Display first few rows of the dataset
DT::datatable(round(expr_quantile[1:8, 1:5], 3)) 

Data Preparation

Preparing expression data for analysis:

Total 18208 genes remain for downstream analyses.

# Restrict to Protein-Coding Genes.
annot_proteincoding <- annot[annot$gene_type == "protein_coding",] # 18208 protein coding genes.
expr_quantile <- expr_quantile[rownames(expr_quantile) %in% rownames(annot_proteincoding),]
dim(expr_quantile)
## [1] 18208    39
DT::datatable(round(expr_quantile[1:8, 1:5], 3)) 

Differential gene expression.

# Limma approach applied.
design <- model.matrix(~ clin$low_high)
fit <- lmFit(expr_quantile , design)
fit <- eBayes(fit)

# Sort by Pvalue.
top.table <- topTable(fit, sort.by = "P", n = Inf)

# To see How many DE gene are there.
length(which(top.table$adj.P.Val < 0.05))  # 0 DE genes.
## [1] 0
length(which(top.table$P.Value < 0.05))   # 1060 DE genes
## [1] 1060
# Preview
DT::datatable(data.frame(round(data.frame(top.table),3)))
# Save top table to .txt file.
top.table$Gene <- rownames(top.table)
write.table(top.table, file = "~/BHK lab/Pancreas-cancer/output data/ICGC_Limma_Fit.txt", row.names = F, sep = "\t", quote = F)

Volcano plot

# 1. Volcano plot.

# 1.1 Volcano Plot based on P value
EnhancedVolcano(top.table,
    lab = rownames(top.table),
    x = 'logFC',
    y = 'P.Value',
    pCutoff = 0.05,
    FCcutoff = 1.5,
    xlim = c(-5, 5),
    ylim = c(0, -log10(10e-4)),
    title= 'Volcano Plot based on P value'
    )

# 1.2 Volcano Plot based on FDR 
# adjusted p-values as red 
EnhancedVolcano(top.table,
    lab = rownames(top.table),
    x = 'logFC',
    y = 'adj.P.Val',
    pCutoff = 0.05,
    FCcutoff = 1.5,
    xlim = c(-5, 5),
    ylim = c(0, -log10(10e-4)),
    legendLabels=c('Not sig.','Log FC','adj p-value',
      'adj p-value & Log FC'),
    title= 'Volcano Plot based on FDR',)

MSigDB Pathway analysis

Hallmark Geneset.

# Load gene sets from a GMT file for pathway analysis
pathwaysH <- gmtPathways("~/BHK lab/Pancreas-cancer/GSEA/h.all.v2023.2.Hs.symbols.gmt")

# 1. Create ranks

# Create 'ranks' vector from 'top.table' with logFC as values and gene names as names.
ranks <- top.table$logFC
names(ranks) <- top.table$Gene
ranks <- sort(ranks, decreasing = T)

# Preview the data table of ranks.
DT::datatable(data.frame(ranks))
# 2. Bar Plot
# Display the ranked fold changes.
barplot(ranks)

# 3. Conduct analysis
# Run fgsea with 'pathwaysH' and 'ranks'.
fgseaRes <- fgsea(pathwaysH, ranks)
fgseaRes <- na.omit(fgseaRes)

fgseaResTidy <- fgseaRes %>%
  as_tibble() %>%
  arrange(desc(NES)) #order by NES

# Combine genes in leadingEdge with semicolons
fgseaResTidy$leadingEdge <- sapply(fgseaResTidy$leadingEdge, paste, collapse = ";")
fgseaResTidy <- fgseaResTidy[order(fgseaResTidy$padj), ]

# Show in a Tidy table.
head(fgseaResTidy)
## # A tibble: 6 × 8
##   pathway                   pval     padj log2err     ES   NES  size leadingEdge
##   <chr>                    <dbl>    <dbl>   <dbl>  <dbl> <dbl> <int> <chr>      
## 1 HALLMARK_G2M_CHECKPO… 5.19e-29 2.60e-27   1.40  -0.677 -2.90   187 BUB1;TPX2;…
## 2 HALLMARK_E2F_TARGETS  1.12e-28 2.80e-27   1.39  -0.669 -2.91   194 CDCA3;AURK…
## 3 HALLMARK_MTORC1_SIGN… 2.01e-12 3.35e-11   0.899 -0.528 -2.29   194 BUB1;ACTR3…
## 4 HALLMARK_MYC_TARGETS… 3.00e-12 3.74e-11   0.899 -0.524 -2.27   192 CDC45;MCM4…
## 5 HALLMARK_TNFA_SIGNAL… 8.51e-12 7.09e-11   0.875 -0.509 -2.21   197 CXCL10;G0S…
## 6 HALLMARK_MITOTIC_SPI… 8.00e-12 7.09e-11   0.875 -0.516 -2.24   196 BUB1;TPX2;…
# Save fgseaRes as txt file.
write.table(fgseaResTidy, "~/BHK lab/Pancreas-cancer/output data/ICGC_fgseaRes_HRank.txt", sep = "\t", row.names = FALSE, quote = FALSE)

# Mark "significant" if adjusted p-value <= 0.05, else "non-significant".
fgseaResTidy$adjPvalue <- ifelse(fgseaResTidy$padj <= 0.05, "significant", "non-significant")

# Define colors for significance: grey for non-significant and red for significant results.
cols <- c("non-significant" = "grey", "significant" = "red")

# Using ggplot: reorder pathways by NES, fill bars based on significance, flip coordinates for readability.
ggplot(fgseaResTidy, aes(reorder(pathway, NES), NES, fill = adjPvalue)) + geom_col() +
    scale_fill_manual(values = cols) + coord_flip() +
    theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) +
    labs(x = "Pathway", y = "Normalized Enrichment Score", title = "Hallmark pathways NES
         Enrichment Score from GSEA")

# 4. GSEA Results Table Plot.
# Plot GSEA table for significant pathways (padj < 0.05) using a GSEA parameter of 0.5.
plotGseaTable(pathwaysH[fgseaRes$pathway[fgseaRes$padj < 0.05]], ranks, fgseaRes, gseaParam=0.5)

# 5.  Enrichment score plot.

# For significant pathways # 21 pathways.

# Store the list of significant pathways (padj < 0.05).
significant_pathways <- fgseaRes$pathway[fgseaRes$padj < 0.05]

# Use a for loop to iterate and plot enrichment for each significant pathway
for (i in significant_pathways) {
  enrichment_plot <- plotEnrichment(pathwaysH[[i]], ranks)
  enrichment_plot <- enrichment_plot + ggtitle(paste("Enrichment Plot for Pathway:",i))
  print(enrichment_plot)
}

# Plot significant pathways as points with a gradient color based on p-value and size based on a 'size' variable.
ggplot(fgseaResTidy[fgseaResTidy$padj < 0.05, ], aes(y=reorder(pathway, size),x= NES)) + geom_point(aes(color=padj,size=size)) + 
  scale_color_gradientn(colours = c("red","blue")) + 
  labs(x='Normalized Enrichment Score', y=NULL ) + 
  theme( axis.title = element_text(face='bold'), axis.text = element_text (face='bold') )

KEGG Pathway Analysis

To do KEGG pathway analysis, both KEGG_MEDICUS and KEGG_LEGACY are downloaded. The cutoff of 0.05 is applied for visualization, while the text files include the results across all gene sets.

KEGG_MEDICUS

# Load KEGG_MEDICUS pathways.
pathwaysKEG <- gmtPathways("~/BHK lab/Pancreas-cancer/GSEA/c2.cp.kegg_medicus.v2023.2.Hs.symbols.gmt")

# Perform fgsea with KEGG pathways
fgseaRes_kegg <- fgsea(pathwaysKEG, ranks)

# Organize and sort KEGG results by NES
fgseaResTidy_kegg <- fgseaRes_kegg %>%
  as_tibble() %>%
  arrange(desc(NES))

# Combine genes in leadingEdge with semicolons
fgseaResTidy_kegg$leadingEdge <- sapply(fgseaResTidy_kegg$leadingEdge, paste, collapse = ";")

# Show in a Tidy table of keg.
head(fgseaResTidy_kegg)
## # A tibble: 6 × 8
##   pathway                     pval    padj log2err    ES   NES  size leadingEdge
##   <chr>                      <dbl>   <dbl>   <dbl> <dbl> <dbl> <int> <chr>      
## 1 KEGG_MEDICUS_ENV_FACTOR… 4.97e-5 0.00280   0.557 0.722  2.07    23 GSTA1;GSTM…
## 2 KEGG_MEDICUS_ENV_FACTOR… 1.51e-4 0.00468   0.519 0.737  2.04    19 GSTA1;GSTM…
## 3 KEGG_MEDICUS_REFERENCE_… 2.22e-3 0.0327    0.432 0.639  1.85    24 GSTA1;GSTM…
## 4 KEGG_MEDICUS_REFERENCE_… 7.50e-3 0.0745    0.407 0.699  1.79    14 KIF7;IHH;G…
## 5 KEGG_MEDICUS_VARIANT_MU… 4.39e-3 0.0494    0.407 0.582  1.77    29 WNT5A;KIF7…
## 6 KEGG_MEDICUS_VARIANT_MU… 4.39e-3 0.0494    0.407 0.582  1.77    29 WNT5A;KIF7…
# Save fgseaRes_keg as txt file.
write.table(fgseaResTidy_kegg, "~/BHK lab/Pancreas-cancer/output data/ICGC_fgseaRes_keggMEDRank.txt", sep = "\t", row.names = FALSE, quote = FALSE)
# Mark "significant" if adjusted p-value <= 0.05, else "non-significant".
fgseaResTidy_kegg$adjPvalue <- ifelse(fgseaResTidy_kegg$padj <= 0.05, "significant", "non-significant")

# Create a ggplot of significant pathways, with bars colored based on adjusted p-value significance.
ggplot(fgseaResTidy_kegg[fgseaResTidy_kegg$padj < 0.05, ], aes(reorder(pathway, NES), NES, fill = adjPvalue)) + 
  geom_col() + 
  scale_fill_manual(values = cols) + 
  coord_flip() + 
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) +
  labs(x = "Pathway", y = "Normalized Enrichment Score", title = "KEGG pathways NES Enrichment Score from GSEA")

# Plot significant pathways as points with a gradient color based on p-value and size based on a 'size' variable.
ggplot(fgseaResTidy_kegg[fgseaResTidy_kegg$padj < 0.05, ], aes(y=reorder(pathway, size),x= NES)) + geom_point(aes(color=padj,size=size)) + 
  scale_color_gradientn(colours = c("red","blue")) + 
  labs(x='Normalized Enrichment Score', y=NULL ) + 
  theme( axis.title = element_text(face='bold'), axis.text = element_text (face='bold') )

## KEGG_LEGACY

# Load KEGG_MEDICUS pathways.
pathwaysKEG_LEG <- gmtPathways("~/BHK lab/Pancreas-cancer/GSEA/c2.cp.kegg_legacy.v2023.2.Hs.symbols.gmt")

# Perform fgsea with KEGG pathways
fgseaRes_kegg <- fgsea(pathwaysKEG, ranks)

# Organize and sort KEGG results by NES
fgseaResTidy_kegg <- fgseaRes_kegg %>%
  as_tibble() %>%
  arrange(desc(NES))

# Combine genes in leadingEdge with semicolons
fgseaResTidy_kegg$leadingEdge <- sapply(fgseaResTidy_kegg$leadingEdge, paste, collapse = ";")

# Show in a Tidy table of keg.
head(fgseaResTidy_kegg)
## # A tibble: 6 × 8
##   pathway                     pval    padj log2err    ES   NES  size leadingEdge
##   <chr>                      <dbl>   <dbl>   <dbl> <dbl> <dbl> <int> <chr>      
## 1 KEGG_MEDICUS_ENV_FACTOR… 6.21e-5 0.00259   0.538 0.722  2.04    23 GSTA1;GSTM…
## 2 KEGG_MEDICUS_ENV_FACTOR… 1.23e-4 0.00402   0.519 0.737  1.99    19 GSTA1;GSTM…
## 3 KEGG_MEDICUS_REFERENCE_… 1.33e-3 0.0217    0.455 0.639  1.82    24 GSTA1;GSTM…
## 4 KEGG_MEDICUS_REFERENCE_… 7.66e-3 0.0741    0.407 0.699  1.74    14 KIF7;IHH;G…
## 5 KEGG_MEDICUS_VARIANT_MU… 5.02e-3 0.0536    0.407 0.582  1.73    29 WNT5A;KIF7…
## 6 KEGG_MEDICUS_VARIANT_MU… 5.02e-3 0.0536    0.407 0.582  1.73    29 WNT5A;KIF7…
# Save fgseaRes_keg as txt file.
write.table(fgseaResTidy_kegg, "~/BHK lab/Pancreas-cancer/output data/ICGC_fgseaRes_keggLEGRank.txt", sep = "\t", row.names = FALSE, quote = FALSE)
# Mark "significant" if adjusted p-value <= 0.05, else "non-significant".
fgseaResTidy_kegg$adjPvalue <- ifelse(fgseaResTidy_kegg$padj <= 0.05, "significant", "non-significant")


# Create a ggplot of significant pathways, with bars colored based on adjusted p-value significance.
ggplot(fgseaResTidy_kegg[fgseaResTidy_kegg$padj < 0.05, ], aes(reorder(pathway, NES), NES, fill = adjPvalue)) + 
  geom_col() + 
  scale_fill_manual(values = cols) + 
  coord_flip() + 
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) +
  labs(x = "Pathway", y = "Normalized Enrichment Score", title = "KEGG (legacy) pathways NES Enrichment Score from GSEA")

# Plot significant pathways as points with a gradient color based on p-value and size based on a 'size' variable.
ggplot(fgseaResTidy_kegg[fgseaResTidy_kegg$padj < 0.05, ], aes(y=reorder(pathway, size),x= NES)) + geom_point(aes(color=padj,size=size)) + 
  scale_color_gradientn(colours = c("red","blue")) + 
  labs(x='Normalized Enrichment Score', y=NULL ) + 
  theme( axis.title = element_text(face='bold'), axis.text = element_text (face='bold') )